set.seed(8675309) # For reproducibility
n <- 100 # Number of observations
# Simulate data
x1 <- rbinom(n, 1, 0.5)
x2 <- rnorm(n, 10, 2)
error <- rnorm(n, 0, 3) # Error term
b0 <- 5
b1 <- 2
b2 <- -0.5
y <- b0 + b1 * x1 + b2 * x2 + error
sim_data <- data.frame(y, x1, x2)The Linear OLS Model
Introduction
This document demonstrates the process of linear regression, starting with data simulation, estimation using R’s lm() function, and then replicating the process using matrix algebra. We’ll work with both simulated and real-world data (Boston housing dataset).
Simulated Data Example
We’ll simulate data based on the following model:
y = b_0 + b_1*x1 + b_2*x2 + error
where:
yis a continuous, positive outcome variable.x1is a binary variable.x2is a continuous variable.
Estimation with lm()
model_lm <- lm(y ~ x1 + x2, data = sim_data)
summary(model_lm)
Call:
lm(formula = y ~ x1 + x2, data = sim_data)
Residuals:
Min 1Q Median 3Q Max
-8.0134 -2.2345 0.3141 2.3522 8.4468
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.7208 1.8970 1.961 0.0527 .
x1 2.7952 0.6710 4.166 6.73e-05 ***
x2 -0.4278 0.1837 -2.329 0.0219 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.353 on 97 degrees of freedom
Multiple R-squared: 0.1874, Adjusted R-squared: 0.1707
F-statistic: 11.19 on 2 and 97 DF, p-value: 4.254e-05
# Create a table of coefficients
lm_coef_table <- knitr::kable(coef(summary(model_lm)), digits = 3, caption = "Coefficients from `lm()`")
lm_coef_table| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 3.721 | 1.897 | 1.961 | 0.053 |
| x1 | 2.795 | 0.671 | 4.166 | 0.000 |
| x2 | -0.428 | 0.184 | -2.329 | 0.022 |
code
#plot predictions in highcharter
# Prediction Plot
pred_plot <- highchart() %>%
hc_add_series(plot_data, "scatter", hcaes(x = x2, y = y), name = "Actual Data (x1=0)") %>%
hc_add_series(data.frame(x2 = x2_pred, pred = predictions), "line", hcaes(x=x2, y = pred), name="Predicted") %>%
hc_title(text = "Regression Predictions (lm)") %>%
hc_xAxis(title = list(text = "x2")) %>%
hc_yAxis(title = list(text = "y"))
pred_plotcode
# Residual Plot
resid_plot <- highchart() %>%
hc_add_series(plot_data, "column", hcaes(x = x2, y = pred - y), name = "Residuals", color="grey") %>%
hc_xAxis(title = list(text = "x2")) %>%
hc_yAxis(title = list(text = "Residuals")) %>%
hc_title(text = "Plot of Residuals (lm)")
resid_plotEstimation with Matrix Algebra
# Create design matrix X and outcome vector y
X <- cbind(1, sim_data$x1, sim_data$x2)
y <- sim_data$y
# Calculate coefficients using matrix algebra
b <- solve(t(X) %*% X) %*% t(X) %*% y
# Calculate standard errors
residuals <- y - X %*% b
sigma_sq <- sum(residuals^2) / (n - ncol(X))
var_b <- sigma_sq * solve(t(X) %*% X)
se_b <- sqrt(diag(var_b))
# Create a table of coefficients
matrix_coef_table <- knitr::kable(data.frame(Estimate = b, `Std. Error` = se_b, row.names = c("Intercept", "x1", "x2")), digits = 3, caption = "Coefficients from Matrix Algebra")
matrix_coef_table| Estimate | Std..Error | |
|---|---|---|
| Intercept | 3.721 | 1.897 |
| x1 | 2.795 | 0.671 |
| x2 | -0.428 | 0.184 |
# Predictions and plots (matrix)
X_pred <- cbind(1, x1_pred, x2_pred)
predictions_matrix <- X_pred %*% b
# Prediction Plot
pred_plot_matrix <- highchart() %>%
hc_add_series(plot_data, "scatter", hcaes(x = x2, y = y), name = "Actual Data (x1=0)") %>%
hc_add_series(data.frame(x2 = x2_pred, pred = predictions_matrix), "line", hcaes(x=x2, y = pred), name="Predicted") %>%
hc_title(text = "Regression Predictions (Matrix)") %>%
hc_xAxis(title = list(text = "x2")) %>%
hc_yAxis(title = list(text = "y"))
pred_plot_matrixresid_plot_matrix <- highchart() %>%
hc_add_series(plot_data, "column", hcaes(x = x2, y = pred - y), name = "Residuals", color="grey") %>%
hc_xAxis(title = list(text = "x2")) %>%
hc_yAxis(title = list(text = "Residuals")) %>%
hc_title(text = "Plot of Residuals (Matrix)")
resid_plot_matrixBoston Housing Data Example
library(MASS)
data(Boston)We estimate this model: medv ~ crim + age + tax + black + dis + ptratio
boston_model <- lm(medv ~ crim + age + tax + black + dis + ptratio, data = Boston)
summary(boston_model)
Call:
lm(formula = medv ~ crim + age + tax + black + dis + ptratio,
data = Boston)
Residuals:
Min 1Q Median 3Q Max
-16.671 -4.151 -1.315 2.270 33.143
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 58.773732 3.721963 15.791 < 2e-16 ***
crim -0.140667 0.046639 -3.016 0.002691 **
age -0.094768 0.017458 -5.428 8.89e-08 ***
tax -0.007048 0.002845 -2.477 0.013576 *
black 0.014560 0.003972 3.666 0.000273 ***
dis -0.922736 0.238417 -3.870 0.000123 ***
ptratio -1.519762 0.166843 -9.109 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 7.179 on 499 degrees of freedom
Multiple R-squared: 0.398, Adjusted R-squared: 0.3907
F-statistic: 54.98 on 6 and 499 DF, p-value: < 2.2e-16
boston_lm_coef_table <- knitr::kable(coef(summary(boston_model)), digits = 3, caption = "Boston Model Coefficients (lm)")
boston_lm_coef_table| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 58.774 | 3.722 | 15.791 | 0.000 |
| crim | -0.141 | 0.047 | -3.016 | 0.003 |
| age | -0.095 | 0.017 | -5.428 | 0.000 |
| tax | -0.007 | 0.003 | -2.477 | 0.014 |
| black | 0.015 | 0.004 | 3.666 | 0.000 |
| dis | -0.923 | 0.238 | -3.870 | 0.000 |
| ptratio | -1.520 | 0.167 | -9.109 | 0.000 |
# Predictions for Boston data (lm)
central_tendencies <- apply(Boston[, c("crim", "age", "tax", "black", "dis")], 2, median)
ptratio_pred <- seq(min(Boston$ptratio), max(Boston$ptratio), length.out = 100)
boston_new_data <- data.frame(crim = central_tendencies["crim"],
age = central_tendencies["age"],
tax = central_tendencies["tax"],
black = central_tendencies["black"],
dis = central_tendencies["dis"],
ptratio = ptratio_pred)
boston_predictions <- predict(boston_model, newdata = boston_new_data)
# Plotting data (using all observations for scatter)
plot_data_boston_lm <- Boston
plot_data_boston_lm$pred <- predict(boston_model)
# Prediction plot
boston_lm_pred_plot <- highchart() %>%
hc_add_series(plot_data_boston_lm, "scatter", hcaes(x = ptratio, y = medv), name = "Actual Data") %>%
hc_add_series(data.frame(ptratio = ptratio_pred, pred = boston_predictions), "line", hcaes(x=ptratio, y = pred), name="Predicted") %>%
hc_title(text = "Boston Housing Predictions (lm)") %>%
hc_xAxis(title = list(text = "ptratio")) %>%
hc_yAxis(title = list(text = "medv"))
boston_lm_pred_plot# Residuals plot
boston_lm_resid_plot <- highchart() %>%
hc_add_series(plot_data_boston_lm, "column", hcaes(x = ptratio, y = pred - medv), name = "Residuals", color="grey") %>%
hc_xAxis(title = list(text = "ptratio")) %>%
hc_yAxis(title = list(text = "Residuals")) %>%
hc_title(text = "Plot of Residuals (lm)")
boston_lm_resid_plotMatrix Algebra with Boston Data
X_boston <- model.matrix(boston_model)
y_boston <- Boston$medv
b_boston <- solve(t(X_boston) %*% X_boston) %*% t(X_boston) %*% y_boston
# ... (Calculate standard errors, t-values, p-values similarly to the simulated data example. Include this code if needed).
boston_matrix_coef_table <- knitr::kable(data.frame(Estimate=b_boston), digits = 3, caption = "Boston Model Coefficients (Matrix)")
boston_matrix_coef_table| Estimate | |
|---|---|
| (Intercept) | 58.774 |
| crim | -0.141 |
| age | -0.095 |
| tax | -0.007 |
| black | 0.015 |
| dis | -0.923 |
| ptratio | -1.520 |
# Predictions for Boston data (Matrix)
X_boston_pred <- cbind(1, central_tendencies["crim"], central_tendencies["age"], central_tendencies["tax"],
central_tendencies["black"], central_tendencies["dis"], ptratio_pred)
boston_predictions_matrix <- X_boston_pred %*% b_boston
# Prediction Plot
boston_matrix_pred_plot <- highchart() %>%
hc_add_series(plot_data_boston_lm, "scatter", hcaes(x = ptratio, y = medv), name = "Actual Data") %>%
hc_add_series(data.frame(ptratio = ptratio_pred, pred = boston_predictions_matrix), "line", hcaes(x = ptratio, y = pred), name = "Predicted") %>%
hc_title(text = "Boston Housing Predictions (Matrix)") %>%
hc_xAxis(title = list(text = "ptratio")) %>%
hc_yAxis(title = list(text = "medv"))
boston_matrix_pred_plot# Residual plot (from lm predictions - for fair comparison)
boston_matrix_resid_plot <- highchart() %>%
hc_add_series(plot_data_boston_lm, "column", hcaes(x = ptratio, y = pred - medv), name = "Residuals", color="grey") %>%
hc_xAxis(title = list(text = "ptratio")) %>%
hc_yAxis(title = list(text = "Residuals")) %>%
hc_title(text = "Plot of Residuals (Matrix)")
boston_matrix_resid_plot